{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import os\n",
    "import requests\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "import json\n",
    "import time\n",
    "import random\n",
    "from lxml import etree"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 影信息通过动态加载，所有的信息都藏在基础网页，唯一变动的是start"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "url1 = 'https://movie.douban.com/j/new_search_subjects?sort=T&range=0,10&tags=%E7%94%B5%E5%BD%B1&start=0'\n",
    "url2 = 'https://movie.douban.com/j/new_search_subjects?sort=T&range=0,10&tags=%E7%94%B5%E5%BD%B1&start=20'\n",
    "url3 = 'https://movie.douban.com/j/new_search_subjects?sort=T&range=0,10&tags=%E7%94%B5%E5%BD%B1&start=40'\n",
    "url4 = 'https://movie.douban.com/j/new_search_subjects?sort=T&range=0,10&tags=%E7%94%B5%E5%BD%B1&start=60'"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 构造爬取的网址"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "#构造网页\n",
    "def format_url(num):\n",
    "    urls = []\n",
    "    base_url = 'https://movie.douban.com/j/new_search_subjects?sort=T&range=0,10&tags=%E7%94%B5%E5%BD%B1&start={}'\n",
    "    for i in range(0,20 * num,20):\n",
    "        url = base_url.format(i)\n",
    "        urls.append(url)\n",
    "    return urls\n",
    "\n",
    "#这里是爬取10页，可以自行更改参数\n",
    "urls = format_url(10)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "collapsed": true
   },
   "source": [
    "### 伪装请求头"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "headers = {'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'}"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "###  解析单页"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def parse_base_info(url,headers):\n",
    "    html = requests.get(url,headers = headers)   \n",
    "    bs = json.loads(html.text)\n",
    "    df = pd.DataFrame()\n",
    "    for i in bs['data']:\n",
    "        casts = i['casts']  #主演\n",
    "        cover = i['cover']  #海报\n",
    "        directors = i['directors']  #导演\n",
    "        m_id = i['id']  #ID\n",
    "        rate = i['rate'] #评分\n",
    "        star = i['star'] #标记人数 \n",
    "        title = i['title']  #片名\n",
    "        url = i['url']  #网址\n",
    "        cache = pd.DataFrame({'主演':[casts],'海报':[cover],'导演':[directors],\n",
    "                              'ID':[m_id],'评分':[rate],'标记':[star],'片名':[title],'网址':[url]})\n",
    "        df = pd.concat([df,cache])\n",
    "    return df"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "collapsed": true
   },
   "source": [
    "### 循环批量爬取电影"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "I had crawled page of:1\n",
      "I had crawled page of:2\n",
      "I had crawled page of:3\n",
      "I had crawled page of:4\n",
      "I had crawled page of:5\n",
      "I had crawled page of:6\n",
      "I had crawled page of:7\n",
      "I had crawled page of:8\n",
      "I had crawled page of:9\n",
      "I had crawled page of:10\n"
     ]
    }
   ],
   "source": [
    "result = pd.DataFrame()\n",
    "\n",
    "count = 1\n",
    "for url in urls:\n",
    "    df = parse_base_info(url,headers = headers)\n",
    "    result = pd.concat([result,df])\n",
    "    time.sleep(random.random() + 2)\n",
    "    print('I had crawled page of:%d' % count)\n",
    "    count += 1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>ID</th>\n",
       "      <th>主演</th>\n",
       "      <th>导演</th>\n",
       "      <th>标记</th>\n",
       "      <th>海报</th>\n",
       "      <th>片名</th>\n",
       "      <th>网址</th>\n",
       "      <th>评分</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>26752088</td>\n",
       "      <td>[徐峥, 王传君, 周一围, 谭卓, 章宇]</td>\n",
       "      <td>[文牧野]</td>\n",
       "      <td>45</td>\n",
       "      <td>https://img3.doubanio.com/view/photo/s_ratio_p...</td>\n",
       "      <td>我不是药神</td>\n",
       "      <td>https://movie.douban.com/subject/26752088/</td>\n",
       "      <td>9.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1295644</td>\n",
       "      <td>[让·雷诺, 娜塔莉·波特曼, 加里·奥德曼, 丹尼·爱罗, 彼得·阿佩尔]</td>\n",
       "      <td>[吕克·贝松]</td>\n",
       "      <td>45</td>\n",
       "      <td>https://img3.doubanio.com/view/photo/s_ratio_p...</td>\n",
       "      <td>这个杀手不太冷</td>\n",
       "      <td>https://movie.douban.com/subject/1295644/</td>\n",
       "      <td>9.4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1292052</td>\n",
       "      <td>[蒂姆·罗宾斯, 摩根·弗里曼, 鲍勃·冈顿, 威廉姆·赛德勒, 克兰西·布朗]</td>\n",
       "      <td>[弗兰克·德拉邦特]</td>\n",
       "      <td>50</td>\n",
       "      <td>https://img3.doubanio.com/view/photo/s_ratio_p...</td>\n",
       "      <td>肖申克的救赎</td>\n",
       "      <td>https://movie.douban.com/subject/1292052/</td>\n",
       "      <td>9.7</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>26266893</td>\n",
       "      <td>[屈楚萧, 吴京, 李光洁, 吴孟达, 赵今麦]</td>\n",
       "      <td>[郭帆]</td>\n",
       "      <td>40</td>\n",
       "      <td>https://img3.doubanio.com/view/photo/s_ratio_p...</td>\n",
       "      <td>流浪地球</td>\n",
       "      <td>https://movie.douban.com/subject/26266893/</td>\n",
       "      <td>7.9</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1292720</td>\n",
       "      <td>[汤姆·汉克斯, 罗宾·怀特, 加里·西尼斯, 麦凯尔泰·威廉逊, 莎莉·菲尔德]</td>\n",
       "      <td>[罗伯特·泽米吉斯]</td>\n",
       "      <td>45</td>\n",
       "      <td>https://img3.doubanio.com/view/photo/s_ratio_p...</td>\n",
       "      <td>阿甘正传</td>\n",
       "      <td>https://movie.douban.com/subject/1292720/</td>\n",
       "      <td>9.4</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "         ID                                         主演          导演  标记  \\\n",
       "0  26752088                     [徐峥, 王传君, 周一围, 谭卓, 章宇]       [文牧野]  45   \n",
       "0   1295644     [让·雷诺, 娜塔莉·波特曼, 加里·奥德曼, 丹尼·爱罗, 彼得·阿佩尔]     [吕克·贝松]  45   \n",
       "0   1292052   [蒂姆·罗宾斯, 摩根·弗里曼, 鲍勃·冈顿, 威廉姆·赛德勒, 克兰西·布朗]  [弗兰克·德拉邦特]  50   \n",
       "0  26266893                   [屈楚萧, 吴京, 李光洁, 吴孟达, 赵今麦]        [郭帆]  40   \n",
       "0   1292720  [汤姆·汉克斯, 罗宾·怀特, 加里·西尼斯, 麦凯尔泰·威廉逊, 莎莉·菲尔德]  [罗伯特·泽米吉斯]  45   \n",
       "\n",
       "                                                  海报       片名  \\\n",
       "0  https://img3.doubanio.com/view/photo/s_ratio_p...    我不是药神   \n",
       "0  https://img3.doubanio.com/view/photo/s_ratio_p...  这个杀手不太冷   \n",
       "0  https://img3.doubanio.com/view/photo/s_ratio_p...   肖申克的救赎   \n",
       "0  https://img3.doubanio.com/view/photo/s_ratio_p...     流浪地球   \n",
       "0  https://img3.doubanio.com/view/photo/s_ratio_p...     阿甘正传   \n",
       "\n",
       "                                           网址   评分  \n",
       "0  https://movie.douban.com/subject/26752088/  9.0  \n",
       "0   https://movie.douban.com/subject/1295644/  9.4  \n",
       "0   https://movie.douban.com/subject/1292052/  9.7  \n",
       "0  https://movie.douban.com/subject/26266893/  7.9  \n",
       "0   https://movie.douban.com/subject/1292720/  9.4  "
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "result.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "collapsed": false
   },
   "source": [
    "### 解析单个页面，获取详细的电影信息"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "def parse_movie_info(url,headers = headers,ip = ''):\n",
    "    if ip == '':\n",
    "        html = requests.get(url,headers = headers)\n",
    "    else:\n",
    "        html = requests.get(url,headers = headers,proxies = ip)\n",
    "    bs = etree.HTML(html.text)\n",
    "    #片名\n",
    "    title = bs.xpath('//div[@id = \"wrapper\"]/div/h1/span')[0].text  \n",
    "    #上映时间\n",
    "    year = bs.xpath('//div[@id = \"wrapper\"]/div/h1/span')[1].text   \n",
    "    #电影类型\n",
    "    m_type = []\n",
    "    for t in bs.xpath('//span[@property = \"v:genre\"]'):\n",
    "        m_type.append(t.text)   \n",
    "    a = bs.xpath('//div[@id= \"info\"]')[0].xpath('string()')\n",
    "    #片长\n",
    "    m_time =a[a.find('片长: ') + 4:a.find('分钟\\n')]  #时长\n",
    "    #地区\n",
    "    area = a[a.find('制片国家/地区:') + 9:a.find('\\n        语言')]  #地区\n",
    "    #评分人数\n",
    "    try:\n",
    "        people = bs.xpath('//a[@class = \"rating_people\"]/span')[0].text\n",
    "    #评分分布\n",
    "        rating = {}\n",
    "        rate_count = bs.xpath('//div[@class = \"ratings-on-weight\"]/div')\n",
    "        for rate in rate_count:\n",
    "            rating[rate.xpath('span/@title')[0]] = rate.xpath('span[@class = \"rating_per\"]')[0].text\n",
    "    except:\n",
    "        people = 'None'\n",
    "        rating = {}\n",
    "    #简介\n",
    "    try:\n",
    "        brief = bs.xpath('//span[@property = \"v:summary\"]')[0].text.strip('\\n                                \\u3000\\u3000')\n",
    "    except:\n",
    "        brief = 'None'\n",
    "    try:\n",
    "        hot_comment = bs.xpath('//div[@id = \"hot-comments\"]/div/div/p/span')[0].text\n",
    "    except:\n",
    "        hot_comment = 'None'\n",
    "    cache = pd.DataFrame({'片名':[title],'上映时间':[year],'电影类型':[m_type],'片长':[m_time],\n",
    "                          '地区':[area],'评分人数':[people],'评分分布':[rating],'简介':[brief],'热评':[hot_comment],'网址':[url]})\n",
    "    return cache"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "collapsed": true
   },
   "source": [
    "### 批量访问单个电影页面"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "我们爬取了第:1部电影-------我不是药神\n",
      "我们爬取了第:2部电影-------这个杀手不太冷\n",
      "我们爬取了第:3部电影-------肖申克的救赎\n",
      "我们爬取了第:4部电影-------流浪地球\n",
      "我们爬取了第:5部电影-------阿甘正传\n",
      "我们爬取了第:6部电影-------复仇者联盟3：无限战争\n",
      "我们爬取了第:7部电影-------盗梦空间\n",
      "我们爬取了第:8部电影-------西虹市首富\n",
      "我们爬取了第:9部电影-------泰坦尼克号\n",
      "我们爬取了第:10部电影-------千与千寻\n",
      "我们爬取了第:11部电影-------霸王别姬\n",
      "我们爬取了第:12部电影-------三傻大闹宝莱坞\n",
      "我们爬取了第:13部电影-------让子弹飞\n",
      "我们爬取了第:14部电影-------怦然心动\n",
      "我们爬取了第:15部电影-------摔跤吧！爸爸\n",
      "我们爬取了第:16部电影-------毒液：致命守护者\n",
      "我们爬取了第:17部电影-------疯狂动物城\n",
      "我们爬取了第:18部电影-------忠犬八公的故事\n",
      "我们爬取了第:19部电影-------一出好戏\n",
      "我们爬取了第:20部电影-------当幸福来敲门\n",
      "我们爬取了第:21部电影-------海上钢琴师\n",
      "我们爬取了第:22部电影-------大话西游之大圣娶亲\n",
      "我们爬取了第:23部电影-------海王\n",
      "我们爬取了第:24部电影-------楚门的世界\n",
      "我们爬取了第:25部电影-------你的名字。\n",
      "我们爬取了第:26部电影-------阿凡达\n",
      "我们爬取了第:27部电影-------少年派的奇幻漂流\n",
      "我们爬取了第:28部电影-------星际穿越\n",
      "我们爬取了第:29部电影-------头号玩家\n",
      "我们爬取了第:30部电影-------无双\n",
      "我们爬取了第:31部电影-------放牛班的春天\n",
      "我们爬取了第:32部电影-------飞屋环游记\n",
      "我们爬取了第:33部电影-------机器人总动员\n",
      "我们爬取了第:34部电影-------那些年，我们一起追的女孩\n",
      "我们爬取了第:35部电影-------龙猫\n",
      "我们爬取了第:36部电影-------寻梦环游记\n",
      "我们爬取了第:37部电影-------红海行动\n",
      "我们爬取了第:38部电影-------初恋这件小事\n",
      "我们爬取了第:39部电影-------大话西游之月光宝盒\n",
      "我们爬取了第:40部电影-------无名之辈\n",
      "我们爬取了第:41部电影-------无间道\n",
      "我们爬取了第:42部电影-------天使爱美丽\n",
      "我们爬取了第:43部电影-------碟中谍6：全面瓦解\n",
      "我们爬取了第:44部电影-------剪刀手爱德华\n",
      "我们爬取了第:45部电影-------复仇者联盟\n",
      "我们爬取了第:46部电影-------战狼2\n",
      "我们爬取了第:47部电影-------美丽人生\n",
      "我们爬取了第:48部电影-------绿皮书\n",
      "我们爬取了第:49部电影-------飞驰人生\n",
      "我们爬取了第:50部电影-------罗马假日\n",
      "我们爬取了第:51部电影-------V字仇杀队\n",
      "我们爬取了第:52部电影-------唐伯虎点秋香\n",
      "我们爬取了第:53部电影-------夏洛特烦恼\n",
      "我们爬取了第:54部电影-------唐人街探案2\n",
      "我们爬取了第:55部电影-------动物世界\n",
      "我们爬取了第:56部电影-------辛德勒的名单\n",
      "我们爬取了第:57部电影-------芳华\n",
      "我们爬取了第:58部电影-------人再囧途之泰囧\n",
      "我们爬取了第:59部电影-------老炮儿\n",
      "我们爬取了第:60部电影-------釜山行\n",
      "我们爬取了第:61部电影-------蝴蝶效应\n",
      "我们爬取了第:62部电影-------神偷奶爸\n",
      "我们爬取了第:63部电影-------七宗罪\n",
      "我们爬取了第:64部电影-------邪不压正\n",
      "我们爬取了第:65部电影-------疯狂的外星人\n",
      "我们爬取了第:66部电影-------哈尔的移动城堡\n",
      "我们爬取了第:67部电影-------复仇者联盟4：终局之战\n",
      "我们爬取了第:68部电影-------蚁人2：黄蜂女现身\n",
      "我们爬取了第:69部电影-------失恋33天\n",
      "我们爬取了第:70部电影-------看不见的客人\n",
      "我们爬取了第:71部电影-------蝙蝠侠：黑暗骑士\n",
      "我们爬取了第:72部电影-------湄公河行动\n",
      "我们爬取了第:73部电影-------加勒比海盗\n",
      "我们爬取了第:74部电影-------本杰明·巴顿奇事\n",
      "我们爬取了第:75部电影-------喜剧之王\n",
      "我们爬取了第:76部电影-------西西里的美丽传说\n",
      "我们爬取了第:77部电影-------美人鱼\n",
      "我们爬取了第:78部电影-------中国合伙人\n",
      "我们爬取了第:79部电影-------小偷家族\n",
      "我们爬取了第:80部电影-------疯狂原始人\n",
      "我们爬取了第:81部电影-------触不可及\n",
      "我们爬取了第:82部电影-------钢铁侠\n",
      "我们爬取了第:83部电影-------后会无期\n",
      "我们爬取了第:84部电影-------超能陆战队\n",
      "我们爬取了第:85部电影-------黑天鹅\n",
      "我们爬取了第:86部电影-------北京遇上西雅图\n",
      "我们爬取了第:87部电影-------情书\n",
      "我们爬取了第:88部电影-------奇异博士\n",
      "我们爬取了第:89部电影-------教父\n",
      "我们爬取了第:90部电影-------血战钢锯岭\n",
      "我们爬取了第:91部电影-------天空之城\n",
      "我们爬取了第:92部电影-------功夫\n",
      "我们爬取了第:93部电影-------超时空同居\n",
      "我们爬取了第:94部电影-------禁闭岛\n",
      "我们爬取了第:95部电影-------银河护卫队\n",
      "我们爬取了第:96部电影-------倩女幽魂\n",
      "我们爬取了第:97部电影-------无问西东\n",
      "我们爬取了第:98部电影-------唐人街探案\n",
      "我们爬取了第:99部电影-------羞羞的铁拳\n",
      "我们爬取了第:100部电影-------复仇者联盟2：奥创纪元\n",
      "我们爬取了第:101部电影-------贫民窟的百万富翁\n",
      "我们爬取了第:102部电影-------搏击俱乐部\n",
      "我们爬取了第:103部电影-------源代码\n",
      "我们爬取了第:104部电影-------爱乐之城\n",
      "我们爬取了第:105部电影-------七月与安生\n",
      "我们爬取了第:106部电影-------闻香识女人\n",
      "我们爬取了第:107部电影-------狮子王\n",
      "我们爬取了第:108部电影-------沉默的羔羊\n",
      "我们爬取了第:109部电影-------穿普拉达的女王\n",
      "我们爬取了第:110部电影-------驴得水\n",
      "我们爬取了第:111部电影-------黑客帝国\n",
      "我们爬取了第:112部电影-------疯狂的石头\n",
      "我们爬取了第:113部电影-------哈利·波特与魔法石\n",
      "我们爬取了第:114部电影-------妖猫传\n",
      "我们爬取了第:115部电影-------美国队长3\n",
      "我们爬取了第:116部电影-------天才枪手\n",
      "我们爬取了第:117部电影-------我的少女时代\n",
      "我们爬取了第:118部电影-------敦刻尔克\n",
      "我们爬取了第:119部电影-------重庆森林\n",
      "我们爬取了第:120部电影-------低俗小说\n",
      "我们爬取了第:121部电影-------西游记之大圣归来\n",
      "我们爬取了第:122部电影-------人在囧途\n",
      "我们爬取了第:123部电影-------消失的爱人\n",
      "我们爬取了第:124部电影-------王牌特工：特工学院\n",
      "我们爬取了第:125部电影-------国王的演讲\n",
      "我们爬取了第:126部电影-------美国队长2\n",
      "我们爬取了第:127部电影-------美丽心灵\n",
      "我们爬取了第:128部电影-------熔炉\n",
      "我们爬取了第:129部电影-------影\n",
      "我们爬取了第:130部电影-------钢铁侠3\n",
      "我们爬取了第:131部电影-------指环王1：魔戒再现\n",
      "我们爬取了第:132部电影-------火星救援\n",
      "我们爬取了第:133部电影-------钢铁侠2\n",
      "我们爬取了第:134部电影-------蚁人\n",
      "我们爬取了第:135部电影-------傲慢与偏见\n",
      "我们爬取了第:136部电影-------致命魔术\n",
      "我们爬取了第:137部电影-------三块广告牌\n",
      "我们爬取了第:138部电影-------布达佩斯大饭店\n",
      "我们爬取了第:139部电影-------东邪西毒\n",
      "我们爬取了第:140部电影-------断背山\n",
      "我们爬取了第:141部电影-------银河护卫队2\n",
      "我们爬取了第:142部电影-------寻龙诀\n",
      "我们爬取了第:143部电影-------西游降魔篇\n",
      "我们爬取了第:144部电影-------秒速5厘米\n",
      "我们爬取了第:145部电影-------指环王3：王者无敌\n",
      "我们爬取了第:146部电影-------活着\n",
      "我们爬取了第:147部电影-------2012\n",
      "我们爬取了第:148部电影-------恐怖游轮\n",
      "我们爬取了第:149部电影-------蜘蛛侠：英雄归来\n",
      "我们爬取了第:150部电影-------告白\n",
      "我们爬取了第:151部电影-------功夫熊猫\n",
      "我们爬取了第:152部电影-------被嫌弃的松子的一生\n",
      "我们爬取了第:153部电影-------驯龙高手\n",
      "我们爬取了第:154部电影-------神奇动物：格林德沃之罪\n",
      "我们爬取了第:155部电影-------蝙蝠侠：黑暗骑士崛起\n",
      "我们爬取了第:156部电影-------冰雪奇缘\n",
      "我们爬取了第:157部电影-------哈利·波特与死亡圣器(下)\n",
      "我们爬取了第:158部电影-------冰川时代\n",
      "我们爬取了第:159部电影-------志明与春娇\n",
      "我们爬取了第:160部电影-------致命ID\n",
      "我们爬取了第:161部电影-------乘风破浪\n",
      "我们爬取了第:162部电影-------金陵十三钗\n",
      "我们爬取了第:163部电影-------美国队长\n",
      "我们爬取了第:164部电影-------黑豹\n",
      "我们爬取了第:165部电影-------哪吒之魔童降世\n",
      "我们爬取了第:166部电影-------雷神3：诸神黄昏\n",
      "我们爬取了第:167部电影-------指环王2：双塔奇兵\n",
      "我们爬取了第:168部电影-------勇敢的心\n",
      "我们爬取了第:169部电影-------天堂电影院\n",
      "我们爬取了第:170部电影-------惊天魔盗团\n",
      "我们爬取了第:171部电影-------致我们终将逝去的青春\n",
      "我们爬取了第:172部电影-------真爱至上\n",
      "我们爬取了第:173部电影-------射雕英雄传之东成西就\n",
      "我们爬取了第:174部电影-------大黄蜂\n",
      "我们爬取了第:175部电影-------怪兽电力公司\n",
      "我们爬取了第:176部电影-------捉妖记\n",
      "我们爬取了第:177部电影-------了不起的盖茨比\n",
      "我们爬取了第:178部电影-------速度与激情7\n",
      "我们爬取了第:179部电影-------死亡诗社\n",
      "我们爬取了第:180部电影-------阳光姐妹淘\n",
      "我们爬取了第:181部电影-------乱世佳人\n",
      "我们爬取了第:182部电影-------入殓师\n",
      "我们爬取了第:183部电影-------岁月神偷\n",
      "我们爬取了第:184部电影-------心灵捕手\n",
      "我们爬取了第:185部电影-------色，戒\n",
      "我们爬取了第:186部电影-------猫鼠游戏\n",
      "我们爬取了第:187部电影-------超体\n",
      "我们爬取了第:188部电影-------阳光灿烂的日子\n",
      "我们爬取了第:189部电影-------烈日灼心\n",
      "我们爬取了第:190部电影-------拯救大兵瑞恩\n",
      "我们爬取了第:191部电影-------蜘蛛侠：平行宇宙\n",
      "我们爬取了第:192部电影-------神奇动物在哪里\n",
      "我们爬取了第:193部电影-------X战警：逆转未来\n",
      "我们爬取了第:194部电影-------雷神\n",
      "我们爬取了第:195部电影-------调音师\n",
      "我们爬取了第:196部电影-------恋恋笔记本\n",
      "我们爬取了第:197部电影-------记忆碎片\n",
      "我们爬取了第:198部电影-------暮光之城\n",
      "我们爬取了第:199部电影-------雷神2：黑暗世界\n",
      "我们爬取了第:200部电影-------两小无猜\n"
     ]
    }
   ],
   "source": [
    "movie_result = pd.DataFrame()\n",
    "ip = ''  #这里构建自己的IP池\n",
    "count2 = 1\n",
    "cw = 1\n",
    "\n",
    "for url,name in zip(result['网址'].values,result['片名'].values):\n",
    "#for name,url in wrongs.items():\n",
    "    try:\n",
    "        cache = parse_movie_info(url,headers = headers,ip = ip)\n",
    "        movie_result = pd.concat([movie_result,cache])\n",
    "        #time.sleep(random.random())\n",
    "        print('我们爬取了第:%d部电影-------%s' % (count2,name))\n",
    "        count2 += 1\n",
    "    except:\n",
    "        print('滴滴滴滴滴，第{}次报错'.format(cw))\n",
    "        print('ip is:{}'.format(ip))\n",
    "        cw += 1\n",
    "        time.sleep(2)\n",
    "        continue"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>上映时间</th>\n",
       "      <th>地区</th>\n",
       "      <th>热评</th>\n",
       "      <th>片名</th>\n",
       "      <th>片长</th>\n",
       "      <th>电影类型</th>\n",
       "      <th>简介</th>\n",
       "      <th>网址</th>\n",
       "      <th>评分人数</th>\n",
       "      <th>评分分布</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>(2018)</td>\n",
       "      <td>中国大陆</td>\n",
       "      <td>“你敢保证你一辈子不得病？”纯粹、直接、有力！常常感叹：电影只能是电影。但每看到这样的佳作，...</td>\n",
       "      <td>我不是药神</td>\n",
       "      <td>117</td>\n",
       "      <td>[剧情, 喜剧]</td>\n",
       "      <td>普通中年男子程勇（徐峥 饰）经营着一家保健品店，失意又失婚。不速之客吕受益（王传君 饰）的到...</td>\n",
       "      <td>https://movie.douban.com/subject/26752088/</td>\n",
       "      <td>1174897</td>\n",
       "      <td>{'还行': '7.0%', '力荐': '57.4%', '较差': '0.5%', '很...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>(1994)</td>\n",
       "      <td>法国</td>\n",
       "      <td>他们居然没做爱</td>\n",
       "      <td>这个杀手不太冷 Léon</td>\n",
       "      <td>110分钟(剧场版) / 133分钟(国际版)\\n        又名: 杀手莱昂 / 终极...</td>\n",
       "      <td>[剧情, 动作, 犯罪]</td>\n",
       "      <td>里昂（让·雷诺饰）是名孤独的职业杀手，受人雇佣。一天，邻居家小姑娘马蒂尔达（纳塔丽·波特曼饰...</td>\n",
       "      <td>https://movie.douban.com/subject/1295644/</td>\n",
       "      <td>1380628</td>\n",
       "      <td>{'还行': '3.2%', '力荐': '74.2%', '较差': '0.2%', '很...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>(1994)</td>\n",
       "      <td>美国</td>\n",
       "      <td>关于希望最强有力的注释。</td>\n",
       "      <td>肖申克的救赎 The Shawshank Redemption</td>\n",
       "      <td>142</td>\n",
       "      <td>[剧情, 犯罪]</td>\n",
       "      <td>20世纪40年代末，小有成就的青年银行家安迪（蒂姆·罗宾斯 Tim Robbins 饰）因涉...</td>\n",
       "      <td>https://movie.douban.com/subject/1292052/</td>\n",
       "      <td>1525345</td>\n",
       "      <td>{'还行': '1.5%', '力荐': '84.6%', '较差': '0.1%', '很...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>(2019)</td>\n",
       "      <td>中国大陆</td>\n",
       "      <td>1.终于，轮到我们仰望星空。2.后启示录死亡废墟，赛博朋克地下城，以及烟波浩渺的末日想象，缔...</td>\n",
       "      <td>流浪地球</td>\n",
       "      <td>125</td>\n",
       "      <td>[科幻, 灾难]</td>\n",
       "      <td>近未来，科学家们发现太阳急速衰老膨胀，短时间内包括地球在内的整个太阳系都将被太阳所吞没。为了...</td>\n",
       "      <td>https://movie.douban.com/subject/26266893/</td>\n",
       "      <td>1264654</td>\n",
       "      <td>{'还行': '22.0%', '力荐': '33.1%', '较差': '4.7%', '...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>(1994)</td>\n",
       "      <td>美国</td>\n",
       "      <td>我生命里最温暖的一部电影</td>\n",
       "      <td>阿甘正传 Forrest Gump</td>\n",
       "      <td>142</td>\n",
       "      <td>[剧情, 爱情]</td>\n",
       "      <td>阿甘（汤姆·汉克斯 饰）于二战结束后不久出生在美国南方阿拉巴马州一个闭塞的小镇，他先天弱智，...</td>\n",
       "      <td>https://movie.douban.com/subject/1292720/</td>\n",
       "      <td>1192711</td>\n",
       "      <td>{'还行': '2.9%', '力荐': '76.0%', '较差': '0.2%', '很...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "     上映时间    地区                                                 热评  \\\n",
       "0  (2018)  中国大陆  “你敢保证你一辈子不得病？”纯粹、直接、有力！常常感叹：电影只能是电影。但每看到这样的佳作，...   \n",
       "0  (1994)    法国                                            他们居然没做爱   \n",
       "0  (1994)    美国                                       关于希望最强有力的注释。   \n",
       "0  (2019)  中国大陆  1.终于，轮到我们仰望星空。2.后启示录死亡废墟，赛博朋克地下城，以及烟波浩渺的末日想象，缔...   \n",
       "0  (1994)    美国                                       我生命里最温暖的一部电影   \n",
       "\n",
       "                                片名  \\\n",
       "0                            我不是药神   \n",
       "0                     这个杀手不太冷 Léon   \n",
       "0  肖申克的救赎 The Shawshank Redemption   \n",
       "0                             流浪地球   \n",
       "0                阿甘正传 Forrest Gump   \n",
       "\n",
       "                                                  片长          电影类型  \\\n",
       "0                                                117      [剧情, 喜剧]   \n",
       "0  110分钟(剧场版) / 133分钟(国际版)\\n        又名: 杀手莱昂 / 终极...  [剧情, 动作, 犯罪]   \n",
       "0                                                142      [剧情, 犯罪]   \n",
       "0                                                125      [科幻, 灾难]   \n",
       "0                                                142      [剧情, 爱情]   \n",
       "\n",
       "                                                  简介  \\\n",
       "0  普通中年男子程勇（徐峥 饰）经营着一家保健品店，失意又失婚。不速之客吕受益（王传君 饰）的到...   \n",
       "0  里昂（让·雷诺饰）是名孤独的职业杀手，受人雇佣。一天，邻居家小姑娘马蒂尔达（纳塔丽·波特曼饰...   \n",
       "0  20世纪40年代末，小有成就的青年银行家安迪（蒂姆·罗宾斯 Tim Robbins 饰）因涉...   \n",
       "0  近未来，科学家们发现太阳急速衰老膨胀，短时间内包括地球在内的整个太阳系都将被太阳所吞没。为了...   \n",
       "0  阿甘（汤姆·汉克斯 饰）于二战结束后不久出生在美国南方阿拉巴马州一个闭塞的小镇，他先天弱智，...   \n",
       "\n",
       "                                           网址     评分人数  \\\n",
       "0  https://movie.douban.com/subject/26752088/  1174897   \n",
       "0   https://movie.douban.com/subject/1295644/  1380628   \n",
       "0   https://movie.douban.com/subject/1292052/  1525345   \n",
       "0  https://movie.douban.com/subject/26266893/  1264654   \n",
       "0   https://movie.douban.com/subject/1292720/  1192711   \n",
       "\n",
       "                                                评分分布  \n",
       "0  {'还行': '7.0%', '力荐': '57.4%', '较差': '0.5%', '很...  \n",
       "0  {'还行': '3.2%', '力荐': '74.2%', '较差': '0.2%', '很...  \n",
       "0  {'还行': '1.5%', '力荐': '84.6%', '较差': '0.1%', '很...  \n",
       "0  {'还行': '22.0%', '力荐': '33.1%', '较差': '4.7%', '...  \n",
       "0  {'还行': '2.9%', '力荐': '76.0%', '较差': '0.2%', '很...  "
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "movie_result.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "collapsed": true
   },
   "source": [
    "### 文件存储"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "result.to_excel('电影基本信息大全.xlsx')\n",
    "movie_result.to_excel('电影详细信息.xlsx')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "anaconda-cloud": {},
  "kernelspec": {
   "display_name": "Python [default]",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
